#@title Default title text
! git clone https://github.com/tayuny/Data_Viz_proj.git
#@title Default title text
! pip install geopandas
import altair as alt
import pandas as pd
import numpy as np
import geopandas as geop
from shapely.geometry import Point
import re
import json
import warnings
from io import BytesIO
import requests
warnings.filterwarnings('ignore')
#@title Default title text
c_color_theme = ["#9cceff", "#2794ff", "#138bff", "#0063c4", "#0059b0", "#004f9c", "#004589"]
d_color_theme2 = ["#634c53", "#4b87ff", "#f2e522", "#cdd7d9", "#b0b0ff", "#b0d8ff", "#f27d22", "#ffcb64"]
def general_theme(*args, **kwargs):
return {'width': 400, 'height': 500,
'config': {"range": {"category": d_color_theme2, "diverging": c_color_theme},
'legend': {'titleFontSize': 12, 'labelFontSize': 12, "strokeColor":'gray',
"fillColor":'#fefdee', "padding":12},
'axis': {'titleFontSize': 12, 'labelFontSize': 12},
"title":{"font": "Lato", "fontSize": 16, "color":"black"},
"header":{"titleFontSize":12}}}
alt.themes.register('general_theme', general_theme)
alt.themes.enable('general_theme')
#@title Default title text
full_data = pd.read_csv("Data_Viz_proj/data/full_data50.csv")
selected_columns = ['AIN', 'ZIPcode5', 'RollYear', 'CT','GeneralUseType', 'TotalValue', 'EffectiveYearBuilt', 'SQFTmain', 'Bedrooms', 'Bathrooms',
'unemployment', 'pop', 'gini', 'median_income','poverty_rate', 'pop75', 'Travel_Time701902', 'Travel_Time267800', 'TotalValue2017',
'TotalValue2016', 'TotalValue2015', 'TotalValue2014']
main_data = full_data[selected_columns]
main_data = main_data[main_data["median_income"] != "-"]
main_data["median_income"] = main_data["median_income"].astype(float)
plot_data = main_data.groupby("CT").median()
#@title Default title text
plot_data["med_inc_b"] = 0
over_all_mid_income = plot_data.groupby("CT").mean()["median_income"].median()
plot_data.loc[plot_data["median_income"] >= over_all_mid_income, "med_inc_b"] = 1
plot_data["CT"] = plot_data.index
plot_data["building_age"] = 2018 - plot_data["EffectiveYearBuilt"]
The main purpose of the project is to evaluate the effect from the traveling time to commercial districts to housing value. First of all, a clear definition of commercial district is necessary for this purpose.
#@title Default title text
nearest10 = list(main_data.groupby("CT").mean()["Travel_Time701902"].sort_values(ascending=True)[:10].index.astype(str))
usage_CT_data = ((main_data[(main_data["GeneralUseType"] != "(missing)") & (main_data["GeneralUseType"] != "Dry Farm")]
).groupby(["CT", "GeneralUseType"]).size() / \
main_data.groupby(["CT"]).size()
).to_frame(name="use_ratio").reset_index()
usage_CT_data["CT"] = usage_CT_data["CT"].astype(str)
neighbor_dist = list(usage_CT_data[usage_CT_data["GeneralUseType"] == "Commercial"].sort_values(
by="use_ratio", ascending=False)["CT"][:20])
alt.Chart(usage_CT_data[usage_CT_data["CT"].isin(neighbor_dist)]).mark_bar().encode(
x=alt.X('sum(use_ratio)', stack="normalize", axis=alt.Axis(title="Usage Ratio for specific buildings")),
y=alt.Y("CT", axis=alt.Axis(title="Census Tract ID")),
color=alt.Color("GeneralUseType", legend=alt.Legend(title=["Building", "Usage Type"]))
).properties(
width = 500,
height = 400,
title = {"text" : "Percentage of Building Usage Type in Each Neighborhood",
"subtitle" : ["20 Neighborhoods (with census tract number) which has highest ratio for commercial properties",
"Region 701902 is selected to represent the Commercial District"]}
)
In the above graph, we can discover the ratio of the general usage type of all the properties in given regions. We discover that cencus tract 701902 has both high number and proportion of commercial properties. Thus, I define census tract 701902 as commercial district in the following research. Given the fact that region 702300 which contains the most commercial properties are close to tract 701902 (less than 5 min), the commercial districts is concentrated in this case.
#@title Default title text
selected_columns = ['TotalValue', 'SQFTmain', 'Bedrooms', 'Bathrooms', 'unemployment', 'pop', 'gini',
'median_income', 'poverty_rate', 'pop75', 'Travel_Time701902', 'Travel_Time267800',
'CT', 'building_age']
geop_file = geop.read_file("Data_Viz_proj/data/full_sub_CL701902.geojson")
geop_file.loc[:, 'lon'] = geop_file.geometry.centroid.x
geop_file.loc[:, 'lat'] = geop_file.geometry.centroid.y
geop_sub = geop_file[geop_file["CT"] == 701902]
#@title Default title text
def plot_geo_from_file(center_CT, select_col, col_type, title_dict, legend_title, tooltip_col, center_color="black"):
'''
The function take the center census and select columns and
provide geographical visualization for them
Inputs:
center_CT: commercial census tract
select_cols: columns for visualization
col_type: the type of the selected columns
'''
new_selected = "properties." + select_col + ":Q"
save_dir = "Data_Viz_proj/data/full_sub_CL" + center_CT + ".geojson"
with open(save_dir) as f:
LA_shape_g = json.load(f)
data_geojson = alt.InlineData(values=LA_shape_g, format=alt.DataFormat(property='features',type='json'))
chart = alt.Chart(data_geojson).mark_geoshape(stroke="black").encode(
color=alt.Color(new_selected, type='quantitative', scale=alt.Scale(scheme="yellowgreenblue"), title=legend_title),
tooltip=tooltip_col
).properties(
width=500,
height=300,
title=title_dict
)
label = alt.Chart(geop_file[geop_file["CT"] == 701902]).mark_text(color=center_color, text=["Commercial", "District"]).encode(
latitude="lat",
longitude="lon",
)
return chart + label
tooltip_col = ["properties.CT:N", "properties.SQFTmain:Q", "properties.EffectiveYearBuilt:Q"]
title_dict = {"text": ["The Location of Commercial District in Santa Monica and the Corresponding Traveling Time to it"],
"subtitle": ["Commercial District of Santa Monica is Located along the coast, traveling time is shorter along the coast"]}
plot_geo_from_file("701902", "Travel_Time701902", "Q", title_dict , ["Traveling Time", "To Commercial", "District"], tooltip_col)
#@title Default title text
com_per = pd.DataFrame(np.percentile(main_data[main_data["GeneralUseType"] == "Commercial"]["TotalValue"], np.arange(1,99,1)), columns=["Property_Value"])
com_per["Use_Type"] = "Commercial"
com_per["percentile"] = com_per.index
ind_per = pd.DataFrame(np.percentile(main_data[main_data["GeneralUseType"] == "Industrial"]["TotalValue"], np.arange(1,99,1)), columns=["Property_Value"])
ind_per["Use_Type"] = "Industrial"
ind_per["percentile"] = com_per.index
res_per = pd.DataFrame(np.percentile(main_data[main_data["GeneralUseType"] == "Residential"]["TotalValue"], np.arange(1,99,1)), columns=["Property_Value"])
res_per["Use_Type"] = "Residential"
res_per["percentile"] = res_per.index
total_per = pd.concat([res_per, com_per], join="inner")
total_per = pd.concat([total_per, ind_per], join="inner")
#@title Default title text
accu = alt.Chart(total_per).mark_area(opacity=0.3).encode(
x=alt.X("percentile:O", title="Percentile of Each Property Type"),
y=alt.Y("Property_Value:Q", title="Property Value of the Specific Percentile (Black Line: median property value)"),
color=alt.Color("Use_Type:N", title="Property Type"),
opacity=alt.value(0.8)
).properties(
width = 600,
height = 400,
title={"text": "Property Value of Each Percentile with Different Building Type",
"subtitle": ["The property value increase exponentially at the higher percentile, the distribution is unbalance",
"Buildings with higher profitability (Commercial and Industrial) have higher value and skewedness"]}
)
global_median = alt.Chart(total_per).mark_rule(color='black').encode(
y = alt.Y('median(Property_Value):Q'),
)
accu + global_median
#@title Default title text
MT_data = pd.DataFrame(columns=full_data.columns)
for c in pd.unique(full_data["CT"]):
sub_data = full_data[full_data.CT == c].reset_index(drop=True)
sub_size = sub_data.shape[0]
sel_idx = np.random.permutation(sub_size)[:int((sub_size * 0.05) // 1)]
MT_data = pd.concat([MT_data, sub_data.iloc[sel_idx]])
allCT = pd.unique(full_data["CT"])
geopf = geop.read_file("Data_Viz_proj/data/full_sub_CL701902.geojson")
sub_full = geopf[geopf["CT"].isin(allCT)]
sub_full.to_file("Data_Viz_proj/data/full_sub_CL701902_part.geojson", driver='GeoJSON')
#@title Default title text
save_dir = "Data_Viz_proj/data/full_sub_CL701902_part.geojson"
with open(save_dir) as f:
LA_shape_g = json.load(f)
data_geojson = alt.InlineData(values=LA_shape_g, format=alt.DataFormat(property='features',type='json'))
background = alt.Chart(data_geojson).mark_geoshape(color="#f6f3f4").encode().properties(
width=600,
height=400
)
chart = alt.Chart(MT_data[MT_data.GeneralUseType != "(missing)"]
).mark_circle(size=25).encode(
longitude='CENTER_LON:Q',
latitude='CENTER_LAT:Q',
color=alt.Color("GeneralUseType", title="Building Type"),
opacity = alt.value(0.8)
).properties(
width=600,
height=400,
title={"text": "The Sampled Locations for Properties in Different Usage Type",
"subtitle": ["residential properties locate in the specific blocks and",
"commercial properties tend to locate along the main roads and commercial district"]}
)
background + chart
#@title Default title text
chart_SQFTmain = alt.Chart(plot_data).mark_rect().encode(
x=alt.X('unemployment:Q', axis=alt.Axis(title="Unemployment Rate in the Neighborhood"), bin=alt.Bin(maxbins=20) ),
y=alt.Y('median_income:Q', axis=alt.Axis(title="Median Income in the Neighborhood"), bin=alt.Bin(maxbins=20)),
color=alt.Color('TotalValue:Q', title="Property Value")
)
circ = chart_SQFTmain.mark_point().encode(
alt.ColorValue('grey'),
alt.Size('pop',
legend=alt.Legend(title=['Population in ', "the Region"])
),
tooltip=alt.Tooltip('CT:N', title='Neighborhood')
)
(chart_SQFTmain + circ).properties(
width = 500,
height = 400,
title={"text": "Relation Between Median Income, Unemployment Rate, Population and Property Value",
"subtitle": ["In general, neighborhood with higher median income tend to have higher median property value",
"the main commercial district (blue cell at the right bottom) is surprisingly characterized with",
"high property values, low median income and high unemployment rate"]}
)
#@title Default title text
tooltip_col = ["properties.CT:N", "properties.SQFTmain:Q", "properties.EffectiveYearBuilt:T"]
title_dict = {"text": ["The Distribution of Median Property Value Among Neighborhoods in Santa Monica"],
"subtitle": ["the median property value in the commercial district is high because of its profitability",
", where most of the industrial and commercial properties located"]}
plot_geo_from_file("701902", "TotalValue", "Q", title_dict , ["Median Property", "Value in the", "Region"], tooltip_col, "white")
#@title Default title text
tooltip_col = ["properties.CT:N", "properties.poverty_rate:N", "properties.pop:Q"]
title_dict = {"text": ["The Distribution of Median Income Among Neighborhoods in Santa Monica"],
"subtitle": ["the median income of the neighborhoods are low in the center of the city (commercial district)",
"and it is relatively high in the sub-urban area"]}
plot_geo_from_file("701902", "median_income", "Q", title_dict , ["Median Income", "in the", "Region"], tooltip_col)
#@title Default title text
sqft_perc = np.percentile(main_data["SQFTmain"], np.arange(0,100,1))
main_data["SQFT_group"] = 0
for i, q in enumerate(sqft_perc[:-1]):
main_data.loc[(main_data["SQFTmain"] > sqft_perc[i + 1]) & (main_data["GeneralUseType"] == "Residential"), "SQFT_group"] = i + 1
rep_main = main_data.groupby("SQFT_group").median().reset_index()
#@title Default title text
rel1 = alt.Chart(rep_main[rep_main["SQFT_group"] <= 95]).mark_circle().encode(x=alt.X('SQFTmain:Q', axis=alt.Axis(title="Square Footage of a Building")),
y=alt.Y('TotalValue:Q', axis=alt.Axis(title="Approximated Property Value (Black line: median)")),
color=alt.Color("Travel_Time701902:Q", title= ["Traveling Time", "to Commercial","District"]),
size=alt.Size("TotalValue", title="Property Value")
).properties(
width = 600,
height = 400,
title = {"text": ["Relation between Housing Value, Housing Square Footage, and Traveling Time to Commercial Districts",
"for Residential Properties"],
"subtitle": ["Linear relation between building square footage and the property value is clear until a certain level, ",
"Buildings with higher square footage and higher value tend to have higher traveling time to the city center"]}
)
global_median = alt.Chart(rep_main).mark_rule(color='black').encode(
y=alt.Y('median(TotalValue):Q'),
)
rel1 + global_median
Note: House Value labelled in the graph is approximated by the median house value of given neighborhood
#@title Default title text
sqft_perc = np.percentile(main_data["SQFTmain"], np.arange(0,100,1))
main_data["SQFT_group"] = 0
for i, q in enumerate(sqft_perc[:-1]):
main_data.loc[(main_data["SQFTmain"] > sqft_perc[i + 1]) & (main_data["GeneralUseType"] == "Commercial"), "SQFT_group"] = i + 1
rep_main = main_data.groupby("SQFT_group").median().reset_index()
#@title Default title text
rel2 = alt.Chart(rep_main[rep_main["SQFT_group"] <= 95]).mark_circle().encode(x=alt.X('SQFTmain:Q', axis=alt.Axis(title="Square Footage of a Building")),
y=alt.Y('TotalValue:Q', axis=alt.Axis(title="Total Value of a Building (Black line: median)")),
color=alt.Color("Travel_Time701902:Q", title= ["Traveling Time", "to Commercial","District"]),
size=alt.Size("TotalValue", title="Property Value")
).properties(
width = 600,
height = 400,
title = {"text": ["Relation between Housing Value, Housing Square Footage, and Traveling Time to Residential Districts",
"for Commercial Properties"],
"subtitle": ["The linear relation between these characteristics for the residential properties is not observed in",
"the case of commercial properties"]}
)
global_median = alt.Chart(rep_main).mark_rule(color='black').encode(
y=alt.Y('median(TotalValue):Q'),
)
rel2 + global_median
Note: House Value labelled in the graph is approximated by the median house value of given Neighborhood
#@title Default title text
def create_yearly_table(grouped_data, selected_columns, year_list,
group_col="CT", group_thres=np.arange(0,1,0.2)):
'''
The function take the summarized table with the selected columns and year list
and provide information of the time trend for different sub-group specified in
group_col parameter
Inputs:
grouped_data: summarized table for census tracts
selected_columns: columns to be kept in the final table
year_list: The tax year for House Value included
group_col: columns for creating sub-groups
group_thres: percentile taken to group the group_col
Return: summarized table grouped by group_col and year
'''
final_df = grouped_data[selected_columns]
final_df["year"] = 2018
if group_col != "CT":
for i, thres in enumerate(group_thres):
final_df.loc[final_df[group_col] >= final_df[group_col].quantile(
thres), group_col + "_b"] = i * 20
for year in year_list:
temp = grouped_data
valcol = "TotalValue" + str(year)
temp["TotalValue"] = temp[valcol]
temp = temp[selected_columns]
temp["year"] = year
if group_col != "CT":
for i, thres in enumerate(group_thres):
temp.loc[temp[group_col] >= temp[group_col].quantile(
thres), group_col + "_b"] = i * 20
final_df = pd.concat([final_df, temp], join="inner")
return final_df
#@title Default title text
plot_data["building_age"] = 2018 - plot_data["EffectiveYearBuilt"]
selected_columns = ['AIN', 'ZIPcode5', 'RollYear', 'TotalValue', 'EffectiveYearBuilt',
'SQFTmain', 'Bedrooms', 'Bathrooms', 'unemployment', 'pop', 'gini',
'median_income', 'poverty_rate', 'pop75', 'Travel_Time701902',
'Travel_Time267800', 'building_age']
final_plot_data = create_yearly_table(plot_data, selected_columns, [2014, 2015, 2016,2017])
#@title Default title text
step = 60
overlap = 1
alt.Chart(final_plot_data, height=step, width=600).transform_joinaggregate(
median_value="median(TotalValue)", groupby=["year"]
).transform_bin(
['bin_max', 'bin_min'], 'TotalValue'
).transform_aggregate(
value='count()', groupby=["year", "median_value", "bin_min", "bin_max"]
).transform_impute(
impute="value", groupby=["year", "median_value"], key="bin_min", value=0
).mark_area(
interpolate='monotone',
fillOpacity=0.8,
stroke='lightgray',
strokeWidth=0.5
).encode(
alt.X('bin_min:Q', bin="binned", title='Property Values between 2014 to 2018'),
alt.Y(
'value:Q',
scale=alt.Scale(range=[step, -step * overlap]),
axis=None
),
alt.Fill(
'median_value:Q',
legend=alt.Legend(title=["Median Property", "Value of", "Communities"])
)
).facet(
row=alt.Row(
'year:N',
title=None,
header=alt.Header(labelAngle=0, labelAlign='right')
)
).properties(
title= {"text": 'The Distribution of Community-Level Median Property Value Between 2014 - 2018',
"subtitle": ["the median property values vary toward the right, which means that the increse is driven by the",
"middle and high valued properties and the low-valued properties remain"]},
bounds='flush'
).configure_facet(
spacing=0
).configure_view(
stroke=None
).configure_title(
anchor='end'
)